Kapitel 14 Modals in Slovene Texts

14.1 Programi

library(tidyverse)
library(scales)
library(janitor)
library(readtext)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(tidytext)
library(readxl)
library(writexl)

14.2 Branje besedil

slovtwit1 <- read_rds("data/tmls_all_slo_politicians_2021-03-21.rds")
head(slovtwit1)
## # A tibble: 6 x 90
##   user_id   status_id           created_at          screen_name text      source
##   <chr>     <chr>               <dttm>              <chr>       <chr>     <chr> 
## 1 258856900 1373313507467403266 2021-03-20 16:40:29 JJansaSDS   "Alojzij~ Twitt~
## 2 258856900 1373313373677424643 2021-03-20 16:39:57 JJansaSDS   "Aleluja~ Twitt~
## 3 258856900 1373308627658928134 2021-03-20 16:21:06 JJansaSDS   "Pozor, ~ Twitt~
## 4 258856900 1373308499942375424 2021-03-20 16:20:35 JJansaSDS   "Ne poza~ Twitt~
## 5 258856900 1373308106206277633 2021-03-20 16:19:02 JJansaSDS   "Mödendo~ Twitt~
## 6 258856900 1373307672087429124 2021-03-20 16:17:18 JJansaSDS   "V četrt~ Twitt~
## # ... with 84 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, quote_count <int>,
## #   reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## #   urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## #   media_t.co <list>, media_expanded_url <list>, media_type <list>, ...
slovtwit <- read_rds("data/tmls_all_slo_politicians_2021-09-18.rds")
head(slovtwit)
## # A tibble: 6 x 90
##   user_id   status_id           created_at          screen_name text      source
##   <chr>     <chr>               <dttm>              <chr>       <chr>     <chr> 
## 1 258856900 1439192528507621376 2021-09-18 11:40:12 JJansaSDS   "V Zagor~ Twitt~
## 2 258856900 1439191956303777797 2021-09-18 11:37:55 JJansaSDS   "Iskrena~ Twitt~
## 3 258856900 1439190521843421185 2021-09-18 11:32:13 JJansaSDS   "Discuss~ Twitt~
## 4 258856900 1439190476091973635 2021-09-18 11:32:03 JJansaSDS   "Že 1995~ Twitt~
## 5 258856900 1439190233589895170 2021-09-18 11:31:05 JJansaSDS   "#Taliba~ Twitt~
## 6 258856900 1439190017390358528 2021-09-18 11:30:13 JJansaSDS   "Tako se~ Twitt~
## # ... with 84 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, quote_count <int>,
## #   reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## #   urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## #   media_t.co <list>, media_expanded_url <list>, media_type <list>, ...
parent_folder<-"data/casniki/"
filenames <- list.files(parent_folder, recursive=TRUE, full.names = T, pattern = "\\.json$")
head(filenames, 3)
## [1] "data/casniki/delo_korona2.json" "data/casniki/delo_mnenja2.json"
## [3] "data/casniki/delo_novice2.json"
# slovbes <- lapply(filenames,load,.GlobalEnv)
slovcas <- readtext(filenames, text_field = "text", encoding = "latin1", verbosity = 0,
                    ignore_missing_files = T)
slovcas
## readtext object consisting of 2674 documents and 4 docvars.
## # Description: df [2,674 x 6]
##   doc_id              text                date       title     intro    url     
##   <chr>               <chr>               <chr>      <chr>     <chr>    <chr>   
## 1 delo_korona2.json.1 "\"Vladni gov\"..." 2020-08-19 Hrvaška ~ Vlada b~ https:/~
## 2 delo_korona2.json.2 "\"Ladja gršk\"..." 2020-08-19 Po zasid~ Kaj se ~ https:/~
## 3 delo_korona2.json.3 "\"V torek so\"..." 2020-08-19 V Sloven~ Aktivno~ https:/~
## 4 delo_korona2.json.4 "\"KLJUČNI PO\"..." 2020-08-19 Na Hrvaš~ V člank~ https:/~
## 5 delo_korona2.json.5 "\"Število ok\"..." 2020-08-18 Kacin: U~ Na Hrva~ https:/~
## 6 delo_korona2.json.6 "\"Nestrpnost\"..." 2020-08-19 Vročina ~ V izbor~ https:/~
## # ... with 2,668 more rows

14.3 Korpus

sltwit_corp <- corpus(slovtwit, text_field = "text")
slcas_corp <- corpus(slovcas)
library(DT)
## Warning: package 'DT' was built under R version 4.1.1
slcas_stat <- summary(slcas_corp, n = 2674)
# DT::datatable(slcas_stat, fillContainer = TRUE, filter = "top",
#               options = list(pageLength = 4))

DT::datatable(head(slcas_stat, 30), 
              filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("Text",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))

Ponovno združi statistiko in text v eno podatkovno zbirko.

slcas <- slcas_stat %>% 
  rename(doc_id = Text) %>% 
  left_join(slovcas)
slcas_summary <- textstat_summary(slcas_corp) %>% 
   rename(doc_id = document)
slcas1 <- as_tibble(slcas) %>% 
  left_join(slcas_summary, by = "doc_id")

DT::datatable(head(slcas1, 30), 
              filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("doc_id",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))

14.3.0.1 Zapiši na disk.

# write_csv2(slcas1, "data/slovenski_casniki.csv")

14.4 Besedne oblike in matrika

slcas_toks <- tokens(slcas_corp, remove_numbers = T, remove_punct = T, 
                     remove_symbols = T, remove_url = T)
slcas_dfm <- dfm(slcas_toks)

14.5 Pogostnost oblik

freqs <- textstat_frequency(slcas_dfm) 

DT::datatable(head(freqs, 30), 
              filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("feature",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))

14.6 Konkordance

Za sestavo konkordanc, ki vsebujejo zaželene naklonske zgradbe, uporabljamo funkcijo kwic() programa quanteda. Konkordanco lahko ustvarimo iz jezikovnega gradiva (corpus) ali besednega seznama (tokens). Prednostni način je slednji, saj nam omogoča tudi izločanje nezaželenih izrazov (npr. url, številk idr.).

slcas_toks2 <- tokens(slcas_corp, remove_numbers = T, 
                      remove_punct = FALSE, 
                     remove_symbols = T, remove_url = T)
treba <- kwic(slcas_toks2, pattern = "treba")

S funkcijo as_tibble() pretvorimo konkordanćni seznam v podatkovni niz ali tabelo (prim. Excel). To nam omogoča lepši in preglednejši izpis preglednice in računanje deležev in drugih količin s funkcijami programskega svežnja tidyverse.

treba_kwic <- as_tibble(treba)

DT::datatable(treba_kwic, 
              filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
morati <- kwic(slcas_toks2, 
               pattern = c("morati", "mora", "moram", "moraš",
                           "morava", "morata", "morajo", "moramo",
                           "moral", "morala", "morali", "morale",
                           "moralo"))
morati_kwic <- as_tibble(morati)

DT::datatable(morati_kwic, 
              filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
morati_kwic %>% 
  count(pattern, sort = T) %>% 
  mutate(pattern = fct_reorder(pattern, n)) %>% 
  ggplot(aes(n, pattern, fill = rainbow(12))) +
  geom_col() +
  theme(legend.position = "none")

Next, we calculate the frequency and dispersion of tokens per narrative, which contain the terms ‘dark’ and ‘light.’

term1 <- kwic(slcas_toks2, "treba", valuetype = "regex", case_insensitive = T) %>% 
  group_by(docname) %>% 
  summarise(hits = n()) %>% 
  mutate(percentage = hits/(slcas_stat$Tokens[1:length(hits)]/100), searchterm = "treba") %>%
  arrange(desc(percentage))

term2 <- kwic(slcas_toks2, "mora*", valuetype = "regex", case_insensitive = T) %>%
  group_by(docname) %>% 
  summarise(hits = n()) %>% 
  mutate(percentage = hits/(slcas_stat$Tokens[1:length(hits)]/100), searchterm = "mora*") %>% arrange(desc(percentage))
term1 %>% 
  arrange(-percentage) %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
term2 %>% 
  arrange(-percentage) %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))

14.7 Kolokacije

collo2 <- textstat_collocations(slcas_corp)
collo2 %>% 
  filter(str_detect(collocation, "\\btreba\\b")) %>% 
DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("collocation",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
collo2 %>% 
  filter(str_detect(collocation, "\\bmora")) %>% 
DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("collocation",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
collo3 <- textstat_collocations(slcas_corp, size = 3, tolower = T)
collo3 %>% 
  filter(str_detect(collocation, "\\btreba\\b")) %>% 
  arrange(-count) %>% 
  DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("collocation",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
collo3 %>% 
  filter(str_detect(collocation, "\\bmora")) %>% 
  arrange(-count) %>% 
  DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("collocation",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))

14.8 Konkordance (po povedih)

Ena možnost je tokenizacija po povedih, tako da lahko izvlečemo zgradbo iz posameznih povedi. To ima prednost, da lahko določim dolžino in morebitne druge lastnosti povedi, ki vsebuje naklonsko zgradbo, v primerjavi s povedmi brez naklonske zgradbe.

Nastavitev window = 1 določa, da želimo zajeti eno poved pred iskano zgradbo in eno po iskani zgradbi.

text <- slovcas$text
toks <- tokens(text, what = 'sentence')

“Keyword” je v tem primeru poved z iskano naklonsko zgradbo, “pre” je poved pred njo, “post” pa poved, ki ji sledi.

kwic_mora_poved <- as_tibble(kwic(
  # toks, phrase('\\bmora*\\b\\s\\b[a-z]*ti\\b'), # follow each other
  toks, phrase('\\bmora*\\b'), # there may be other words in between
  valuetype = 'regex', window = 1))

DT::datatable(kwic_mora_poved, 
              filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))

To bi naredili še za druge naklonske zgradbe.

14.9 Konkordance (fraze)

Druga možnost izvleče besedne zveze iz jezikovnega gradiva (corpus). V tem primeru uporabljamo kwic() in kot vzorec (pattern) izberemo phrase().

morati2 <- kwic(slcas_corp, valuetype = "glob",
               pattern = phrase(c("mora* *ti", "mora* *či")), case_insensitive = T)

morati2_kwic <- as_tibble(morati2)

morati2_kwic %>% 
DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
treba2 <- kwic(slcas_corp, valuetype = "glob",
               pattern = phrase(c("treba *ti", "treba *či")), case_insensitive = T)

treba2_kwic <- as_tibble(treba2)

treba2_kwic %>% 
DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
potrebno2 <- kwic(slcas_corp, valuetype = "glob",
               pattern = phrase(c("potrebno *ti", "potrebno *či")), case_insensitive = T)

potrebno2_kwic <- as_tibble(potrebno2)

potrebno2_kwic %>% 
DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))

14.9.1 časniki

Zgoraj smo že pridobili konkordance iz besednega seznama (tokens), ki smo organizirali po povedih (sentences) namesto po besedilih (doc_id). Konkordance so prikazovale tri povedi (keyword, pre- in post-).

Spodnji način najprej predvideva pretvorbo jezikovnega gradiva (corpus) v povedi (sentences), in sicer s funkcijo corpus_reshape(), ki je sestavni del programa quanteda. Na tak način prikazujemo naklonsko zgradbo kot keyword, funkcija kwic() pa ima nastavljeno široko okno, tj. window = 50.

slcas_corp_sent <- corpus_reshape(slcas_corp, to = "sentences", use_docvars = T)

Poizvedba: med naklonskim izrazom morati oz. treba in nedoločnikom so lahko tudi druge besede. Vsi izrazi so znotraj določene povedi.

morati3 <- kwic(slcas_corp_sent, valuetype = "glob", window = 50,
               pattern = phrase(c("mora* *ti", "mora* *či")), case_insensitive = T)

treba3 <- kwic(slcas_corp_sent, valuetype = "glob", window = 50, 
               pattern = phrase(c("treba *ti", "treba *či")), case_insensitive = T)

Prikaz konkordančne preglednice.

morati3_kwic <- as_tibble(morati3)

morati3_kwic %>% 
  DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
treba3_kwic <- as_tibble(treba3)

treba3_kwic %>% 
DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
potrebno3 <- kwic(slcas_corp_sent, valuetype = "glob", window = 50,
               pattern = phrase(c("potrebno *ti", "potrebno *či")), case_insensitive = T)

potrebno3_kwic <- as_tibble(potrebno3)

potrebno3_kwic %>% 
  DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))

14.9.2 twitter

Pretvorba jezikovnega gradiva, tako da bo osnovna enota poved (sentences).

sltwit_corp_sent <- corpus_reshape(sltwit_corp, to = "sentences", use_docvars = T)

Poizvedba in prikaz konkordančne preglednice.

morati3tw <- kwic(sltwit_corp_sent, valuetype = "glob", window = 50,
               pattern = phrase(c("mora* *ti", "mora* *či")), case_insensitive = T)

morati3tw_kwic <- as_tibble(morati3tw)

morati3tw_kwic %>% 
DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
treba3tw <- kwic(sltwit_corp_sent, valuetype = "glob", window = 50,
               pattern = phrase(c("treba *ti", "treba *či")), case_insensitive = T)

treba3tw_kwic <- as_tibble(treba3tw)

treba3tw_kwic %>% 
DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
potrebno3tw <- kwic(sltwit_corp_sent, valuetype = "glob", window = 50,
               pattern = phrase(c("potrebno *ti", "potrebno *či")), case_insensitive = T)

potrebno3tw_kwic <- as_tibble(potrebno3tw)

potrebno3tw_kwic %>% 
DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
imeti3tw <- kwic(sltwit_corp_sent, valuetype = "glob", window = 50,
               pattern = phrase(c("ima* za *ti", "ima* za *či", 
                                  "imel* za *ti", "imel* za *či")), case_insensitive = T)

imeti3tw_kwic <- as_tibble(imeti3tw)

imeti3tw_kwic %>% 
  DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
imeti3twr <- kwic(sltwit_corp_sent, valuetype = "regex", window = 50,
               pattern = phrase(c("\\bima*\\b \\bza\\b t$", 
                                  "\\bima*\\b \\bza\\b ti$")), case_insensitive = T)

imeti3twr_kwic <- as_tibble(imeti3twr)

imeti3twr_kwic %>% 
  DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 10, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
rabiti3tw <- kwic(sltwit_corp_sent, valuetype = "glob", window = 50,
               pattern = phrase(c("rabi* za *ti", "rabi* za *či")), case_insensitive = T)

rabiti3tw_kwic <- as_tibble(rabiti3tw)

rabiti3tw_kwic %>% 
  DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 3, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))
rabiti3twr <- kwic(sltwit_corp_sent, valuetype = "regex", window = 50,
               pattern = phrase(c("\\brab*\\b \\bza\\b t$", 
                                  "\\brab*\\b \\bza\\b ti$")), case_insensitive = T)

rabiti3twr_kwic <- as_tibble(rabiti3twr)

rabiti3twr_kwic %>% 
  DT::datatable(filter = "top", fillContainer = TRUE, 
              extensions = 'Buttons', "ColReorder", 
              options = list(pageLength = 3, 
                             colReorder = TRUE,
                             dom = 'Bfrtip',
    buttons = c('colvis','copy', 'csv', 'excel', 'pdf', 'print')
  )) %>% 
  formatStyle("keyword",
  target = 'row',
  backgroundColor = styleEqual(c(0, 1), c('gray30', 'lightblue')))

14.10 Deleži glagolov

V nadaljevanju uporabljajo konkordance iz prejšnjega odseka, ki vsebujejo zveze naklonskega glagola in nedoločnika kot, tj. phrase(). Izračunali bomo deleže nedoločnikov, ki spremljajo naklonski glagol morati in treba (in nekatere druge).

14.10.1 twitter

m1tw <- morati3tw_kwic %>% 
  separate(keyword, into = c("modal", "inf"), sep = " ") %>% 
  count(inf, sort = T) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  mutate(syl_inf = nsyllable::nsyllable(inf)) %>% 
  mutate(modal = "morati", sample = "twitter")

m1tw %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
morati3tw_kwic %>% 
  separate(keyword, into = c("modal", "inf"), sep = " ") %>% 
  count(inf, sort = T) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  filter(n >5) %>% 
  mutate(inf = fct_reorder(inf, n)) %>% 
  ggplot(aes(procent, inf, fill = inf)) +
  geom_col() +
  theme(legend.position = "none")

t1tw <- treba3tw_kwic %>% 
  separate(keyword, into = c("modal", "inf"), sep = " ") %>% 
  count(inf, sort = T) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  mutate(syl_inf = nsyllable::nsyllable(inf)) %>% 
  mutate(modal = "treba", sample = "twitter")

t1tw %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
treba3tw_kwic %>% 
  separate(keyword, into = c("modal", "inf"), sep = " ") %>% 
  count(inf, sort = T) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  filter(n > 2) %>% 
  mutate(inf = fct_reorder(inf, n)) %>% 
  ggplot(aes(procent, inf, fill = inf)) +
  geom_col() +
  theme(legend.position = "none")

p1tw <- potrebno3tw_kwic %>% 
  separate(keyword, into = c("modal", "inf"), sep = " ") %>% 
  count(inf, sort = T) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  mutate(syl_inf = nsyllable::nsyllable(inf)) %>% 
  mutate(modal = "potrebno", sample = "twitter")

p1tw %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
potrebno3tw_kwic %>% 
  separate(keyword, into = c("modal", "inf"), sep = " ") %>% 
  count(inf, sort = T) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  filter(n > 2) %>% 
  mutate(inf = fct_reorder(inf, n)) %>% 
  ggplot(aes(procent, inf, fill = inf)) +
  geom_col() +
  theme(legend.position = "none")

14.10.2 časniki

m1 <- morati3_kwic %>% 
  separate(keyword, into = c("modal", "inf"), sep = " ") %>% 
  count(inf, sort = T) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  mutate(syl_inf = nsyllable::nsyllable(inf)) %>% 
  mutate(modal = "morati", sample = "casniki")

m1 %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
morati3_kwic %>% 
  separate(keyword, into = c("modal", "inf"), sep = " ") %>% 
  count(inf, sort = T) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  filter(n >15) %>% 
  mutate(inf = fct_reorder(inf, n)) %>% 
  ggplot(aes(procent, inf, fill = inf)) +
  geom_col() +
  theme(legend.position = "none")

t1 <- treba3_kwic %>% 
  separate(keyword, into = c("modal", "inf"), sep = " ") %>% 
  count(inf, sort = T) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  mutate(syl_inf = nsyllable::nsyllable(inf)) %>% 
  mutate(modal = "treba", sample = "casniki")

t1 %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
treba3_kwic %>% 
  separate(keyword, into = c("modal", "inf"), sep = " ") %>% 
  count(inf, sort = T) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  filter(n >5) %>% 
  mutate(inf = fct_reorder(inf, n)) %>% 
  ggplot(aes(procent, inf, fill = inf)) +
  geom_col() +
  theme(legend.position = "none")

p1 <- potrebno3_kwic %>% 
  separate(keyword, into = c("modal", "inf"), sep = " ") %>% 
  count(inf, sort = T) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  mutate(syl_inf = nsyllable::nsyllable(inf)) %>% 
  mutate(modal = "potrebno", sample = "časniki")

p1 %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
potrebno3_kwic %>% 
  separate(keyword, into = c("modal", "inf"), sep = " ") %>% 
  count(inf, sort = T) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  filter(n >5) %>% 
  mutate(inf = fct_reorder(inf, n)) %>% 
  ggplot(aes(procent, inf, fill = inf)) +
  geom_col() +
  theme(legend.position = "none")

14.10.3 vsa besedila

modalinf0 <- bind_rows(m1,t1,m1tw,t1tw) %>% 
  select(sample, modal, inf, n, procent, syl_inf) %>% 
  mutate(modal = tolower(modal), inf = tolower(inf))

modalinf0 %>% 
  DT::datatable(slcas_stat, fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
modalinf0_all <- modalinf0 %>% 
  group_by(modal, inf) %>%
  arrange(inf) %>% 
  summarise(freq = sum(n)) %>% 
  mutate(pct = round(100*freq / sum(freq), 2)) %>% 
  mutate(syl_inf = nsyllable::nsyllable(inf)) %>% 
  arrange(-pct)

modalinf0_all %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
# write_rds(modalinf0, "data/modalinf0.rds")
# write_csv(modalinf0, "data/modalinf0.csv")
# write_rds(modalinf0_all, "data/modalinf0_all.rds")
# write_csv(modalinf0_all, "data/modalinf0_all.csv")
modalinf <- rbind(m1,t1,p1,m1tw,t1tw,p1tw) %>% 
  select(sample, modal, inf, n, procent, syl_inf) %>% 
  mutate(modal = tolower(modal), inf = tolower(inf))

modalinf %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
modalinf_all <- modalinf %>% 
  group_by(modal, inf) %>%
  arrange(inf) %>% 
  summarise(freq = sum(n)) %>% 
  mutate(pct = round(100*freq / sum(freq), 2)) %>% 
  mutate(syl_inf = nsyllable::nsyllable(inf)) %>% 
  arrange(-pct)

modalinf_all %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
# write_rds(modalinf, "data/modalinf.rds")
# write_csv(modalinf, "data/modalinf.csv")
# write_rds(modalinf_all, "data/modalinf_all.rds")
# write_csv(modalinf_all, "data/modalinf_all.csv")

14.10.3.1 glmer

library(lme4)

m000 <- glmer(n ~ 1 + (1|sample), poisson, data = modalinf0)
m001 <- glmer(n ~ 1 + modal*syl_inf +
               (1|sample), poisson, data = modalinf0)
anova(m000, m001)
## Data: modalinf0
## Models:
## m000: n ~ 1 + (1 | sample)
## m001: n ~ 1 + modal * syl_inf + (1 | sample)
##      npar   AIC   BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## m000    2 12451 12461 -6223.6    12447                         
## m001    5 10621 10646 -5305.5    10611 1836.1  3  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(effects)
plot(allEffects(m001), multiline = T)

png("pictures/modalinf0_glmer.png")
plot(allEffects(m001), multiline = T)
dev.off()
## svg 
##   2

14.10.3.2 časniki

modalinf %>% 
  filter(sample == "časniki") %>% 
  top_n(30, procent) %>%
  ungroup() %>% 
  mutate(inf = reorder_within(inf, procent, paste0("(", procent, ")"), sep = " ")) %>% 
  ggplot(aes(procent, inf, fill = inf)) +
  geom_col() +
  theme(legend.position = "none") +
  facet_wrap(~ modal, scales = "free") # both scales are free (instead of free_x, free_y)

14.10.3.3 twitter

modalinf %>% 
  filter(sample == "twitter") %>% 
  top_n(30, procent) %>%
  ungroup() %>% 
  mutate(inf = reorder_within(inf, procent, paste0("(", procent, ")"), sep = " ")) %>% 
  ggplot(aes(procent, inf, fill = inf)) +
  geom_col() +
  theme(legend.position = "none") +
  facet_wrap(~ modal, scales = "free") # both scales are free (instead of free_x, free_y)

14.10.3.4 vsa

modalinf_all %>% 
  top_n(30, pct) %>%
  ungroup() %>%
  mutate(inf = reorder_within(inf, pct, paste0("(", pct, ")"), sep = " ")) %>% 
  ggplot(aes(pct, inf, fill = inf)) +
  geom_col() +
  scale_y_reordered() +
  theme(legend.position = "none") +
  facet_wrap(~ modal, scales = "free") +
  labs(x = "procent", y = "")

14.10.3.5 brez biti

modalinf %>% 
  filter(procent > 1) %>% 
  filter(inf != "biti") %>% 
  mutate(inf = fct_reorder(inf, n)) %>% 
  ggplot(aes(procent, inf, fill = inf)) +
  geom_col() +
  theme(legend.position = "none") +
  facet_wrap(~ modal, scales = "free_x")

modalinf_all %>% 
  filter(inf != "biti") %>% 
  top_n(30, pct) %>%
  ungroup() %>%
  mutate(inf = reorder_within(inf, pct, paste0("(", pct, ")"), sep = " ")) %>% 
  ggplot(aes(pct, inf, fill = inf)) +
  geom_col() +
  scale_y_reordered() +
  theme(legend.position = "none") +
  facet_wrap(~ modal, scales = "free") +
  labs(x = "procent", y = "")

14.10.4 Večkratno pojavljanje

Glagoli se pojavljajo enkrat ali večkrat:

x <- as_tibble(rbind(veckrat = c(212,82), enkrat = c(470-212,214-82))) %>% 
  rename(morati = V1, treba = V2)

chisq.test(x)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  x
## X-squared = 2.4951, df = 1, p-value = 0.1142

14.10.5 imeti + nedoločnik

imeti3 <- kwic(slcas_corp_sent, valuetype = "glob", window = 50,
               pattern = phrase(c("ima* za *ti", "ima* za *či", 
                                  "imel* za *ti", "imel* za *či")), case_insensitive = T)

imeti3_kwic <- as_tibble(imeti3)
imeti3_kwic %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
imeti3tw <- kwic(sltwit_corp_sent, valuetype = "glob", window = 50,
               pattern = phrase(c("ima* za *ti", "ima* za *či", 
                                  "imel* za *ti", "imel* za *či",
                                  "ima* *ti", "ima* *či", 
                                  "imel* *ti", "imel* *či")), case_insensitive = T)

imeti3tw_kwic <- as_tibble(imeti3tw)
imeti3tw_kwic %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))

14.10.6 rabiti + nedoločnik

rabiti3tw <- kwic(sltwit_corp_sent, valuetype = "glob", window = 50,
               pattern = phrase(c("rabi* za *ti", "rabi* za *či",
                                  "rabi* *ti", "rabi* *či")), case_insensitive = T)

rabiti3tw_kwic <- as_tibble(rabiti3tw)
rabiti3tw_kwic %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))

14.11 Udpipe analiza

14.11.1 Load model

library(udpipe)
destfile = "slovenian-ssj-ud-2.5-191206.udpipe"

if(!file.exists(destfile)){
   language_model <- udpipe_download_model(language = "slovenian")
   ud_sl <- udpipe_load_model(language_model$file_model)
   } else {
  file_model = destfile
  ud_sl <- udpipe_load_model(file_model)
}

14.11.2 Annotate

library(udpipe)
slovcas_udpiped <- udpipe_annotate(ud_sl, slovcas$text, trace = FALSE)
slovcas_udpiped <- as.data.frame(slovcas_udpiped)

14.11.2.1 Save

# write_rds(slovcas_udpiped, "data/slovcas_udpiped.rds")
# write_csv(slovcas_udpiped, "data/slovcas_udpiped.csv")

14.11.3 Clean tweets

clean_tweet = gsub("&amp", " ", slovtwit$text)
clean_tweet = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", clean_tweet)
clean_tweet = gsub("@\\w+", " ", clean_tweet)
# clean_tweet = gsub("[[:punct:]]", " ", clean_tweet)
clean_tweet = gsub("[[:digit:]]", " ", clean_tweet)
# Get rid of URLs
clean_tweet = gsub(
  "\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)", "", clean_tweet)
# clean_tweet = gsub("https\\w+", " ", clean_tweet)
# clean_tweet = gsub("http\\w+", " ", clean_tweet)
clean_tweet = gsub("[ \t]{2,}", " ", clean_tweet)
clean_tweet = gsub("^\\s+|\\s+$", " ", clean_tweet) 

# Get rid of URLs
# clean_tweet <- str_replace_all(
#   clean_tweet,"http://t.co/[a-z,A-Z,0-9]*{8}","")

# Take out retweet header, there is only one
clean_tweet <- str_replace(clean_tweet,"RT @[a-z,A-Z]*: "," ")
# Get rid of hashtags
clean_tweet <- str_replace_all(clean_tweet,"#[a-z,A-Z]*"," ")
# Get rid of references to other screennames
clean_tweet <- str_replace_all(clean_tweet,"@[a-z,A-Z]*"," ")
 #get rid of unnecessary spaces
clean_tweet <- str_replace_all(clean_tweet," "," ")

slovtwit$text_cleaned <- clean_tweet

The above expression explained: ? optional space (f|ht) match “f” or “ht” tp match “tp” (s?) optionally match “s” if it’s there (://) match “://” (.) match every character (everything) up to [.|/] a period or a forward-slash (.) then everything after that

# regex "((http|ftp|https):\/\/[\w\-]+(\.[\w\-]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?)"

14.11.4 Annotate tweets

slovtwit_udpiped <- udpipe_annotate(ud_sl, 
                                    slovtwit$text_cleaned, 
                                    trace = F)
slovtwit_udpiped <- as.data.frame(slovtwit_udpiped)

14.11.4.1 Save tweets

# write_rds(slovtwit_udpiped, "data/slovtwit_udpiped.rds")
# write_csv(slovtwit_udpiped, "data/slovtwit_udpiped.csv")

14.11.5 tidytext

tokenize_annotate = function(tbl){
  tbl %>% 
  unnest_tokens(word, token, drop = F) %>% 
  cbind_morphological(term = "feats",  
                      which = c("PronType","NumType","Poss","Reflex",
                                "Foreign","Abbr","Typo",
                                "Gender","Animacy","NounClass",
                                "Case","Number","Definite","Degree",
                                "VerbForm","Person","Tense","Mood",
                                "Aspect","Voice","Evident",
                                "Polarity","Polite","Clusivity")) %>% 
  mutate(txt = str_replace_all(sentence, "[:punct:]", "")) %>% 
  mutate(sentlen = quanteda::ntoken(txt)) %>% 
  mutate(syllables = nsyllable::nsyllable(txt)) %>% 
  mutate(types = quanteda::ntype(txt)) %>% 
  mutate(wordlen = syllables/sentlen) %>% 
  mutate(ttr = types/sentlen) %>% 
  select(-txt, -feats)
}
slovtwit_df <- slovtwit_udpiped %>% 
  tokenize_annotate() %>% mutate(language = "slv",
                                 texttype = "twitter")
slovcas_df <- slovcas_udpiped %>% 
  tokenize_annotate() %>% mutate(language = "slv",
                                 texttype = "news")
slovtxts <- bind_rows(slovtwit_df, slovcas_df)

14.11.5.1 Shrani ud

# write_rds(slovtxts, "data/slovtxts.rds")
# write_csv(slovtxts, "data/slovtxts.csv")

14.11.5.2 Odpri ud

# slovtxts <- read_rds("data/slovtxts.rds")

14.11.5.3 Sample ud

Majhen vzorec za oceno, kolikokrat je uporabljen morati v primeru zunanje nujnosti (ali drugih pomenov).

twitter100 <- slovtxts %>% 
  filter(texttype == "twitter") %>% 
  filter(lemma == "morati" | lemma == "treba") %>% 
  slice_sample(n = 100, replace = TRUE)
# writexl::write_xlsx(twitter100, "data/twitter100.xlsx")

14.11.6 Dependenca

slo_deprel <- slovtxts %>% 
  group_by(texttype) %>% 
  # filter(texttype == "news" | texttype == "twitter") %>% 
  count(dep_rel, sort = TRUE) %>% 
  mutate(pct = round(100*n/sum(n),2)) %>% 
  pivot_wider(names_from = texttype, values_from = c(n, pct)) %>% 
  mutate(across(everything(), ~ replace_na(.x, 0))) %>% 
  mutate(dep_rel = 
           str_replace(dep_rel, "0", "Unknown"))

slo_deprel %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))

14.11.7 Vizualizacija

library(igraph)
library(ggraph)
library(ggplot2)

plot_annotation <- function(x, size = 3){
  stopifnot(is.data.frame(x) & all(c("sentence_id", "token_id", "head_token_id", "dep_rel", "token_id", "token", "lemma", "upos", "xpos", "feats") %in% colnames(x)))
  x <- x[!is.na(x$head_token_id), ]
  x <- x[x$sentence_id %in% min(x$sentence_id), ]
  edges <- x[x$head_token_id != 0, c("token_id", "head_token_id", "dep_rel")]
  edges$label <- edges$dep_rel
  g <- graph_from_data_frame(edges,
                             vertices = x[, c("token_id", "token", "lemma", "upos", "xpos", "feats")],
                             directed = TRUE)
  windowsFonts("Arial Narrow" = windowsFont("Arial"))
  ggraph(g, layout = "linear") +
    geom_edge_arc(ggplot2::aes(label = dep_rel, vjust = -0.20),
                  arrow = grid::arrow(length = unit(4, 'mm'), ends = "last", type = "closed"),
                  end_cap = ggraph::label_rect("wordswordswords"),
                  label_colour = "red", check_overlap = TRUE, label_size = size) +
    geom_node_label(ggplot2::aes(label = token), col = "darkgreen", size = size, fontface = "bold") +
    geom_node_text(ggplot2::aes(label = upos), nudge_y = -0.35, size = size) +
    theme_graph(base_family = "Arial Narrow") +
    labs(title = "udpipe output", subtitle = "tokenisation, parts of speech tagging & dependency relations")
}
# Slovenian
mytext = "Ali jo bodo morali zapustiti" %>% enc2utf8()
x = udpipe(mytext, "slovenian")
x3 = plot_annotation(x, size = 3)
x3

png("pictures/morati_zapustiti.png")
x3
dev.off()
## svg 
##   2
# Slovenian
mytext = "Ali jo bo treba zapustiti" %>% enc2utf8()
x = udpipe(mytext, "slovenian")
x4 = plot_annotation(x, size = 3)
x4

png("pictures/treba_zapustiti.png")
x4
dev.off()
## svg 
##   2

14.11.8 Predmet pred/za modal

predmet je NOUN ali PRON

x = slovtxts %>% 
  group_by(lemma) %>% 
  filter(lemma == "morati" | lemma == "treba" | lemma == "potreben")
x %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
pivot_by_verb_obj = function(tbl){
  tbl %>% 
      mutate(word_order = ifelse(token_id > head_token_id, 
                             "VO", 
                             "OV")) %>% 
  count(dep_rel, word_order) %>% 
  mutate(pct = round(100*n/sum(n),2)) %>% 
  pivot_wider(names_from = texttype, values_from = c(n, pct)) %>% 
  mutate(across(everything(), ~ replace_na(.x, 0))) %>% 
  mutate(dep_rel = 
           str_replace(dep_rel, "0", "Unknown")) %>% 
  select(-dep_rel)
}
vo_nominal_morati = slovtxts %>% 
  group_by(texttype) %>% 
  # filter(str_detect(sentence, "\\bmora*|\\btreba\\b")) %>%
  filter(str_detect(sentence, "\\bmora*")) %>%
  filter(dep_rel == "obj" & 
           upos %in% c("NOUN", "PROPN")) %>% 
  pivot_by_verb_obj() %>% 
  mutate(word_class = "NOUN") %>% 
  mutate(modal = "morati")

vo_nominal_morati %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
vo_nominal_treba = slovtxts %>% 
  group_by(texttype) %>% 
  # filter(str_detect(sentence, "\\bmora*|\\btreba\\b")) %>%
  filter(str_detect(sentence, "\\btreba\\b")) %>%
  filter(dep_rel == "obj" & 
           upos %in% c("NOUN", "PROPN")) %>% 
  pivot_by_verb_obj() %>% 
  mutate(word_class = "NOUN") %>% 
  mutate(modal = "treba")

vo_nominal_treba %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
vo_pronominal_morati = slovtxts %>% 
  group_by(texttype) %>% 
  # filter(str_detect(sentence, "\\bmora*|\\btreba\\b")) %>%
  filter(str_detect(sentence, "\\bmora*")) %>%
  filter(dep_rel == "obj" & 
           upos %in% c("PRON")) %>% 
  pivot_by_verb_obj() %>% 
  mutate(word_class = "PRON") %>% 
  mutate(modal = "morati")

vo_pronominal_morati %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
vo_pronominal_treba = slovtxts %>% 
  group_by(texttype) %>% 
  # filter(str_detect(sentence, "\\bmora*|\\btreba\\b")) %>%
  filter(str_detect(sentence, "\\btreba\\b")) %>%
  filter(dep_rel == "obj" & 
           upos %in% c("PRON")) %>% 
  pivot_by_verb_obj() %>% 
  mutate(word_class = "PRON") %>% 
  mutate(modal = "treba")

vo_pronominal_treba %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
verb_object = bind_rows(vo_nominal_morati, 
                        vo_nominal_treba,
                        vo_pronominal_morati, 
                        vo_pronominal_treba)

verb_object %>% 
  # select(-pct_news,-pct_twitter) %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))

treba, noun: več OV kot VO v twitterju v primerjavi s časniki.

verb_object[3:4,2:3]
## # A tibble: 2 x 2
##   n_news n_twitter
##    <dbl>     <dbl>
## 1    238       132
## 2    631       268
verb_object[3:4,2:3] %>% 
  chisq.test()
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  .
## X-squared = 3.9095, df = 1, p-value = 0.04801

treba, pron: več VO kot OV v twitterju v primerjavi s časniki.

verb_object[7:8,2:3]
## # A tibble: 2 x 2
##   n_news n_twitter
##    <dbl>     <dbl>
## 1    261        84
## 2     54        43
verb_object[7:8,2:3] %>% 
  chisq.test()
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  .
## X-squared = 13.803, df = 1, p-value = 0.000203
# library(esquisse)
# esquisser(verb_object)

ggplot(verb_object) +
 aes(x = word_order, y = pct_twitter, fill = modal) +
 geom_boxplot(shape = "circle") +
 scale_fill_hue(direction = 1) +
 theme_minimal()

ggplot(verb_object) +
 aes(x = word_order, y = pct_news, fill = modal) +
 geom_boxplot(shape = "circle") +
 scale_fill_hue(direction = 1) +
 theme_minimal()

ggplot(verb_object) +
 aes(x = word_class, y = pct_news, fill = word_order) +
 geom_col() +
 scale_fill_hue(direction = 1) +
 theme_minimal() +
 facet_wrap(vars(modal))

ggplot(verb_object) +
 aes(x = word_class, y = pct_twitter, fill = word_order) +
 geom_col() +
 scale_fill_hue(direction = 1) +
 theme_minimal() +
 facet_wrap(vars(modal))

14.12 Kwic udpipe

Cilj: analiza nedoločnikov z udpipe in funkcijo tokenize_annotate(), ki sem jo sestavil zgoraj.

Izbral sem tabelo, ki vsebuje le naklonske zgradbe morati /treba + nedoločnik.

kwic_ud <- modalinf0 %>% 
  pull(inf) %>% 
  udpipe_annotate(ud_sl, ., trace = F)

kwic_ud <- as.data.frame(kwic_ud)

kwic_ud <- kwic_ud %>% 
  tokenize_annotate()

kwic_ud_select <- kwic_ud %>% 
  select(token, lemma, upos, xpos, morph_aspect, syllables)

14.12.1 Združi

modalinf0_ud <- modalinf0 %>% 
  bind_cols(kwic_ud_select) %>% 
  select(-token, -lemma, -syllables)

14.12.2 Pregled napak

Kateri nedoločniki sploh niso nedoločniki in obratno?

modalinf0_ud %>% 
  filter(upos != "VERB" & upos != "AUX") %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))

14.12.2.1 Shrani

write_lines(modalinf0_ud %>% pull(inf), "data/modalinf0_ud_inf.txt")

14.12.3 Obeliks xml

Zgoraj sem uporabil udpipe za določevanje oblikoslovnih lastnosti nedoločnikov.

Primerjam še z analizo programa OBELIKS, ki je analiziral nedoločnike iz datoteke data/modalinf0_ud_inf.txt. Obeliksova izhodna datoteka ima format xml. Datoteko moram pretvoriti. Pri tem mi pomaga library(xml2).

Poglej tudi tabele na spletni strani IJS za pretvorbo specifikacij josMSD: https://nl.ijs.si/jos/josMSD-en.html.

Parsing of xml documents with library(xml2): gastonsanchez

library(rvest)
library(xml2)

obeliks <- read_xml("data/modalinf0_ud_inf.xml")
obeliks
## {xml_document}
## <TEI xmlns="http://www.tei-c.org/ns/1.0">
## [1] <text>\n  <body>\n    <p>\n      <s>\n        <w msd="Gp-n" lemma="biti"> ...

We use the xml_length() to know how many elements or nodes are in the root node:

xml_length(obeliks)
## [1] 1

Only one node in this xml document.

I need the information msd from child5.

root = xml_root(obeliks)
child1 = xml_children(root)
child2 = xml_children(child1)
child3 = xml_children(child2)
child4 = xml_children(child3)
child5 = xml_children(child4)

# Access the children nodes of (root) node 1
xml_child(obeliks, search = 1)
## {xml_node}
## <text>
## [1] <body>\n  <p>\n    <s>\n      <w msd="Gp-n" lemma="biti">biti</w>\n    </ ...
# The function xml_attrs() gives you the attributes of a node. In this case, the node child5 has the attributes msd and lemma.
xml_attrs(child5)[1:3]
## [[1]]
##    msd  lemma 
## "Gp-n" "biti" 
## 
## [[2]]
##     msd   lemma 
##  "Ggnn" "imeti" 
## 
## [[3]]
##        msd      lemma 
##     "Ggdn" "narediti"
# Number of children nodes inside a given node. In this case it is 0.
xml_length(child5)[1:3]
## [1] 0 0 0
# move along the children nodes
xml_name(xml_children(child5)) # no name
## character(0)
xml_name(xml_children(child4))[1:3] # name = "w"
## [1] "w" "w" "w"
xml_name(xml_children(child3))[1:3] # name = "s"
## [1] "s" "s" "s"
xml_name(xml_children(child2))[1:3] # name = "p"
## [1] "p" "p" "p"
xml_name(xml_children(child1)) # name = "body"
## [1] "body"
xml_name(xml_children(root)) # name = "text"
## [1] "text"
title1 <- xml_child(child4, "w") # in this case: NA
cont <- xml_contents(title1) # in this case: 0
titletxt <- xml_text(title1) # in this case : NA

14.12.3.1 Save child5 to csv file:

child5char <- as.character(child5)
write_lines(child5char, "data/obeliks_child5char.txt")

14.12.3.2 Open child5char and parse into data frame.

obeliks_delim <- read_delim("data/obeliks_child5char.txt", 
                            delim = " ",
                            col_names = F) %>% clean_names() %>% 
  select(-x1) %>% 
  separate(x2, into = c("x2", "xpos2"), 
           sep = "=", extra = "merge") %>% 
  separate(x3, into = c("x3", "inf2"), 
           sep = "=", extra = "merge") %>% 
  separate(inf2, into = c("inf2", "x4"), 
           sep = ">", extra = "merge") %>%
  separate(x4, into = c("inf0", "x4"), 
           sep = "<", extra = "merge") %>% 
    mutate(xpos2 = str_remove_all(xpos2, '"'),
         inf2 = str_remove_all(inf2, '"')) %>% 
  select(xpos2, inf0)

14.12.3.3 Save data frame.

write_csv(obeliks_delim, "data/obeliks_delim.csv")

14.12.4 Združi kwic in obeliks

Združim oba podatkovna niza in v nadaljevanju popravljam napake, ki jih je naredil predvsem udpipe, v manjši meri obeliks. Slednji je trainiran na slovenska besedila.

# modalinf0_udobx

modalinf0_udoblx <- modalinf0_ud %>% 
  bind_cols(obeliks_delim) %>% 
  mutate(upos2 = case_when(
    str_detect(xpos2, "^Gg") ~ "VERB",
    str_detect(xpos2, "^Gp") ~ "AUX",
    TRUE ~ "UNK"
  )) %>% 
  mutate(morph_aspect2 = case_when(
    xpos2 == "Ggnn" ~ "Imp",
    xpos2 == "Ggdn" ~ "Perf",
    xpos2 == "Ggvn" ~ "Both",
    xpos2 == "Gp-n" ~ "Imp",
    TRUE ~ "UNK")) %>% 
    mutate(inf = str_replace(inf, "poceti", "početi"),
         inf = str_replace(inf, "bti", "biti"),
         inf = str_replace(inf, "povećavati", "povečavati"),
         inf = str_replace(inf, "prepricati", "prepričati"),
         inf = str_replace(inf, "splacati", "splačati"),
         inf = str_replace(inf, "spostovati", "spoštovati")) %>% 
  mutate(upos2 = case_when(
    upos == "VERB" & 
      upos2 == "UNK" ~ "VERB",
    TRUE ~ upos2
  )) %>% 
  mutate(upos2 = case_when(
    inf == "morebiti" ~ "UNK",
    inf == "biti" ~ "AUX",
    inf == "temeljiti" ~ "VERB",
    inf == "vesti" ~ "VERB",
    inf == "pasti" ~ "VERB",
    inf == "maksimizirati" ~ "VERB",
    inf == "početi" ~ "VERB",
    inf == "povečavati" ~ "VERB",
    inf == "prepričati" ~ "VERB",
    inf == "splačati" ~ "VERB",
    inf == "spoštovati" ~ "VERB",
    inf == "sesuti" ~ "VERB",
    TRUE ~ upos2
  )) %>% 
  mutate(morph_aspect2 = case_when(
    upos2 == "VERB" & 
      upos == "VERB" & 
      morph_aspect2 == "UNK" ~ morph_aspect,
    TRUE ~ morph_aspect2
  )) %>% 
  mutate(xpos2 = case_when(
    inf == "biti" ~ "Gp-n",
    inf == "sprejeti" ~ "Ggdn",
    inf == "reči" ~ "Ggdn",
    inf == "vesti" ~ "Ggnn",
    inf == "izraziti" ~ "Ggdn",
    inf == "zavzeti" ~ "Ggdn",
    inf == "pasti" ~ "Ggdn",
    inf == "temeljiti" ~ "Ggnn",
    inf == "maksimizirati" ~ "Ggvn",
    inf == "početi" ~ "Ggvn",
    inf == "prepričati" ~ "Ggdn",
    inf == "povečavati" ~ "Ggnn",
    inf == "razviti" ~ "Ggdn",
    inf == "splačati" ~ "Ggvn",
    inf == "sesuti" ~ "Ggdn",
    inf == "spoštovati" ~ "Ggnn",
    inf == "zaužiti" ~ "Ggdn",
    TRUE ~ xpos2
  )) %>% 
    mutate(morph_aspect2 = case_when(
    xpos2 == "Ggnn" ~ "Imp",
    xpos2 == "Ggdn" ~ "Perf",
    xpos2 == "Ggvn" ~ "Both",
    xpos2 == "Gp-n" ~ "Imp",
    TRUE ~ "UNK")) %>% 
  select(-upos, -xpos, -inf0, -morph_aspect) %>% 
  rename(upos = upos2, xpos = xpos2,
         morph_aspect = morph_aspect2)

14.12.4.1 Shrani združeno

write_rds(modalinf0_udoblx,
          "data/casniki_twitter_modalinf0_udoblx.rds")
write_csv(modalinf0_udoblx,
          "data/casniki_twitter_modalinf0_udoblx.csv")

14.12.5 Pretvorba josMSD sl>en

Najprej odpremo in prilagodimo tabelo josMSD, ki sem jo dobil na spletni strani IJS, josMSD: https://nl.ijs.si/jos/josMSD-en.html.

josmsd = read.delim2(
  "data/josMSD.tbl", 
  # stolpci so ločeni tabulatorsko
  sep = "\t", 
  # univerzalno kodiranje črk
  encoding = "UTF-8", 
  # dodamo imena stolpcev (lemma, word)
  col.names = c("koda", "msdslo", "msdeng"),
  # preberi kot črkovne nize
  stringsAsFactors = F) %>% 
  select(-koda)

Drugi korak: pretvori podatkovni niz v ločena znakovna niza.

# v tem stolpcu je osnovna oblika besede
lemma = as.character(josmsd$msdeng)
# v tem stolpcu je ena izmed besednih oblik
word = as.character(josmsd$msdslo)

S funkcijo tokens_replace() programa quanteda pretvorimo slovensko specifikacijo v angleško.

modalinf0_x <- 
  # zamenjava seznama (tokens_replace), s pomočjo pull()
  tokens_replace(
    modalinf0_udoblx %>% pull(xpos) %>% tokens(), 
    # obliko, ki jo želimo zamenjati
    pattern = word, 
    # zamenjava
    replacement = lemma, 
    # pazi na začetnico
    case_insensitive = FALSE, 
    # natančno ujemanje oblik
    valuetype = "fixed")

# zdaj imamo angleško specifikacijo (če je program našel zamenjavo)
modalinf0_x  %>% head(3)
## Tokens consisting of 3 documents.
## text1 :
## [1] "Va-n"
## 
## text2 :
## [1] "Vmpn"
## 
## text3 :
## [1] "Vmen"
modalinf0_udoblx$xpos2 <- as.character(modalinf0_x)
modalinf0_udoblx %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))

14.12.5.1 shrani xpos eng

write_rds(modalinf0_udoblx,
          "data/casniki_twitter_modalinf0_udoblx.rds")
write_csv(modalinf0_udoblx,
          "data/casniki_twitter_modalinf0_udoblx.csv")

14.12.5.2 Odpri

# modalinf0_udoblx <- 
#   read_rds("data/casniki_twitter_modalinf0_udoblx.rds")

14.12.6 Oblikoslovje nedoločnikov

aspekt <- modalinf0_udoblx %>% 
  filter(morph_aspect != "UNK") %>% 
  group_by(modal) %>% 
  count(morph_aspect) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  pivot_wider(names_from = "modal", 
              values_from = c("n", "procent"))

aspekt %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))

Brez biti:

aspekt_brez_biti <- modalinf0_udoblx %>% 
  filter(morph_aspect != "UNK") %>% 
  filter(inf != "biti") %>% 
  group_by(modal) %>% 
  count(morph_aspect) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  pivot_wider(names_from = "modal", 
              values_from = c("n", "procent"))

aspekt_brez_biti %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
chisq.test(aspekt[,2:3])
## 
##  Pearson's Chi-squared test
## 
## data:  aspekt[, 2:3]
## X-squared = 0.27055, df = 2, p-value = 0.8735
chisq.test(aspekt_brez_biti[,2:3])
## 
##  Pearson's Chi-squared test
## 
## data:  aspekt_brez_biti[, 2:3]
## X-squared = 0.17605, df = 2, p-value = 0.9157

Namesto pojavnic različnice (types).

aspect_types <- modalinf0_udoblx %>% 
  distinct(modal, inf, .keep_all = T) %>% 
  filter(morph_aspect != "UNK") %>% 
  # filter(inf != "biti") %>% 
  group_by(modal) %>% 
  count(morph_aspect) %>% 
  mutate(procent = round(100*n / sum(n), 2)) %>% 
  pivot_wider(names_from = "modal", 
              values_from = c("n", "procent"))

aspect_types %>% 
  DT::datatable(fillContainer = TRUE, filter = "top",
                options = list(pageLength = 4))
chisq.test(aspect_types[,2:3])
## 
##  Pearson's Chi-squared test
## 
## data:  aspect_types[, 2:3]
## X-squared = 0.16298, df = 2, p-value = 0.9217